# Objective of this notebook:
1. Explain Notebook
2. Serve as stepping stone to allow function to be reproduced across TS Pred Domains

### Competition Explanation
1. Leaderboard score: 6476
2. Notebook Score: 9766, which served as baseline and building block to notebook of score: 11,480
3. Notebook Credit: https://www.kaggle.com/code/tarlannazarov/own-jane-street-with-keras-nn/notebook

#### From : 1_MLP Model_Part 3 - Global Function

In [1]:
# Print Tensorflow, Keras, Numpy version to make version clear to prevent dependecy issues
import tensorflow
import keras
import numpy 
print(tensorflow.__version__)
print(keras.__version__)
print(numpy.__version__)

2.7.0-dev20210806
2.6.0
1.19.5


In [37]:
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
from random import choices

class Kaggle_custom():
    def __init__(self, n_jobs=-1, verbose=0):
        self.n_jobs = n_jobs # -1: all CPUs are used
        self.verbose = verbose

    def JaneStreet_MLP_model_1(self, df, date, target_list, features_list,
                            
                            train_test_split_size,
                            fillna_type,
                            
                            batch_size,
                            hidden_units,
                            dropout_rates,
                            label_smoothing,
                            learning_rate,
                            epochs,                            
                            ):
        
        '''
        format:
        :param Parameter: [type]: {Example or Explanation}
        
        :param df: [pandas DataFrame]: {DataFrame}
        :param date: Not Required, for future input reference
        :param target_list: [List] : {target(s) column list}
        :param features_list: [List] : {features column list}

        :param batch_size: [int] : {5000}
        :param hidden_units: [List] : {[150, 150, 150]}
        :param dropout_rates: [List] : {[0.2, 0.2, 0.2, 0.2]}
        :param label_smoothing: [exponential notation] : {1e-2}
        :param learning_rate: [exponential notation] : {1e-3}
        :param epochs: [int] : {2000}   
        
        '''
        
        print('[Warning] This is a MULTI VARIATE Time Series Prediction')

        from sklearn.model_selection import train_test_split
        #Param PreProcessing
        train, test = train_test_split(df, test_size=train_test_split_size, shuffle=False)
        features = features_list
        
        if fillna_type == 'mean':
            train.fillna(train.mean(),inplace=True)
        else:
            #input custom lambda function or fill_value function
            fill_value = train.fillna(fill_value, inplace=True)

        train['action'] = ((train['resp'].values) > 0).astype(int)

#         f_mean = np.mean(train[features[1:]].values,axis=0) #Original
        f_mean = np.mean(train[features].values,axis=0) # preferred

        X_train = train.loc[:, train.columns.str.contains('feature')]
        y_train = np.stack([(train[c] > 0).astype('int') for c in target_list]).T

        def create_mlp(
            num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
        ):

            inp = tf.keras.layers.Input(shape=(num_columns,))
            x = tf.keras.layers.BatchNormalization()(inp)
            x = tf.keras.layers.Dropout(dropout_rates[0])(x)
            for i in range(len(hidden_units)):
                x = tf.keras.layers.Dense(hidden_units[i])(x)
                x = tf.keras.layers.BatchNormalization()(x)
                x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
                x = tf.keras.layers.Dropout(dropout_rates[i + 1])(x)

            x = tf.keras.layers.Dense(num_labels)(x)
            out = tf.keras.layers.Activation("sigmoid")(x)

            model = tf.keras.models.Model(inputs=inp, outputs=out)
            model.compile(
                optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
                metrics=tf.keras.metrics.AUC(name="AUC"),
            )

            return model

        #Create Model
        clf = create_mlp(len(features), 5, hidden_units, dropout_rates, label_smoothing, learning_rate)
        clf.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)#epochs=200

        models = []
        models.append(clf)

#### Sample Input Parameters

In [7]:
import pandas as pd
df = pd.read_csv('../../../git_datasets/Jane Street Data/Splitted_Data/JaneStreet_Part0.csv',index_col = 0)
features_list = [c for c in df.columns if "feature" in c]
date = 'date' 
target_list = 'resp'
feature = ['feature1','feature2'] 
target_list = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']
train_test_split_size = 0.2
fillna_type = 'mean'      #Choose fill Type 1 
df

# fill_value = train.mean() #Choose fill Type 2

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.049202,0.002514,0.004059,0.015541,0.024346,0.017530,1,-2.541864,0.457145,...,-1.396750,-1.740766,0.984241,1.055600,-0.079467,-2.650392,-1.981605,1.783659,2.866631,87701
1,0,0.180770,-0.000232,0.000292,0.001638,0.002670,0.000906,1,1.435043,-0.801688,...,2.363126,3.088412,-0.680165,-1.452415,-0.482993,0.021540,0.460060,3.930716,-1.233066,131161
2,0,0.807942,0.000639,0.001136,0.002258,0.000157,-0.000691,1,-1.615491,3.070859,...,1.650663,4.536699,-2.317647,0.789400,0.455739,1.509873,-0.890321,-0.652885,1.805953,1964156
3,0,0.000000,0.003162,0.002086,-0.002512,-0.014387,-0.008368,-1,-3.172026,1.501052,...,-0.743715,0.153879,0.180128,-0.653379,0.809250,-3.604416,-1.060510,4.761398,2.454472,1360299
4,0,0.217674,-0.000255,-0.000340,0.001560,0.004779,0.003217,-1,-3.172026,-0.515529,...,-0.277117,1.552482,-0.348023,-0.944887,3.885566,1.968201,-1.015298,0.642338,0.242971,1024723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194617,41,9.499657,-0.000010,-0.000433,-0.002241,-0.003893,-0.000241,-1,2.338139,-1.390194,...,-0.667969,1.654635,-3.354002,-1.561084,-0.021235,0.304516,0.578562,2.351930,-1.043938,2245831
194618,41,0.880071,-0.009220,-0.010878,-0.016373,-0.005701,-0.004407,-1,3.033749,-1.119539,...,-0.142546,-0.441910,2.025904,1.111816,-0.007807,-0.383317,-0.112840,-0.702947,-0.997980,1805522
194619,41,0.000000,0.005172,0.004708,0.009887,0.013415,0.008206,1,4.933404,0.358317,...,0.265527,0.414859,-0.555553,1.416021,0.502948,-1.193979,-1.451042,1.141000,-1.156276,1742984
194620,41,0.078104,-0.001270,-0.001045,0.006422,0.016154,0.010319,1,2.024954,-0.685283,...,0.665552,0.218148,0.648862,-1.365715,0.255495,0.601280,3.224246,-0.650631,-1.370604,887604


### Observe Input Params: [df, date, target_list, features_list,]

In [8]:
df

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.049202,0.002514,0.004059,0.015541,0.024346,0.017530,1,-2.541864,0.457145,...,-1.396750,-1.740766,0.984241,1.055600,-0.079467,-2.650392,-1.981605,1.783659,2.866631,87701
1,0,0.180770,-0.000232,0.000292,0.001638,0.002670,0.000906,1,1.435043,-0.801688,...,2.363126,3.088412,-0.680165,-1.452415,-0.482993,0.021540,0.460060,3.930716,-1.233066,131161
2,0,0.807942,0.000639,0.001136,0.002258,0.000157,-0.000691,1,-1.615491,3.070859,...,1.650663,4.536699,-2.317647,0.789400,0.455739,1.509873,-0.890321,-0.652885,1.805953,1964156
3,0,0.000000,0.003162,0.002086,-0.002512,-0.014387,-0.008368,-1,-3.172026,1.501052,...,-0.743715,0.153879,0.180128,-0.653379,0.809250,-3.604416,-1.060510,4.761398,2.454472,1360299
4,0,0.217674,-0.000255,-0.000340,0.001560,0.004779,0.003217,-1,-3.172026,-0.515529,...,-0.277117,1.552482,-0.348023,-0.944887,3.885566,1.968201,-1.015298,0.642338,0.242971,1024723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194617,41,9.499657,-0.000010,-0.000433,-0.002241,-0.003893,-0.000241,-1,2.338139,-1.390194,...,-0.667969,1.654635,-3.354002,-1.561084,-0.021235,0.304516,0.578562,2.351930,-1.043938,2245831
194618,41,0.880071,-0.009220,-0.010878,-0.016373,-0.005701,-0.004407,-1,3.033749,-1.119539,...,-0.142546,-0.441910,2.025904,1.111816,-0.007807,-0.383317,-0.112840,-0.702947,-0.997980,1805522
194619,41,0.000000,0.005172,0.004708,0.009887,0.013415,0.008206,1,4.933404,0.358317,...,0.265527,0.414859,-0.555553,1.416021,0.502948,-1.193979,-1.451042,1.141000,-1.156276,1742984
194620,41,0.078104,-0.001270,-0.001045,0.006422,0.016154,0.010319,1,2.024954,-0.685283,...,0.665552,0.218148,0.648862,-1.365715,0.255495,0.601280,3.224246,-0.650631,-1.370604,887604


In [9]:
date

'date'

In [10]:
target_list

['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']

In [11]:
features_list

['feature_0',
 'feature_1',
 'feature_2',
 'feature_3',
 'feature_4',
 'feature_5',
 'feature_6',
 'feature_7',
 'feature_8',
 'feature_9',
 'feature_10',
 'feature_11',
 'feature_12',
 'feature_13',
 'feature_14',
 'feature_15',
 'feature_16',
 'feature_17',
 'feature_18',
 'feature_19',
 'feature_20',
 'feature_21',
 'feature_22',
 'feature_23',
 'feature_24',
 'feature_25',
 'feature_26',
 'feature_27',
 'feature_28',
 'feature_29',
 'feature_30',
 'feature_31',
 'feature_32',
 'feature_33',
 'feature_34',
 'feature_35',
 'feature_36',
 'feature_37',
 'feature_38',
 'feature_39',
 'feature_40',
 'feature_41',
 'feature_42',
 'feature_43',
 'feature_44',
 'feature_45',
 'feature_46',
 'feature_47',
 'feature_48',
 'feature_49',
 'feature_50',
 'feature_51',
 'feature_52',
 'feature_53',
 'feature_54',
 'feature_55',
 'feature_56',
 'feature_57',
 'feature_58',
 'feature_59',
 'feature_60',
 'feature_61',
 'feature_62',
 'feature_63',
 'feature_64',
 'feature_65',
 'feature_66',
 'fea

# Working Code

In [38]:
# df = pd.read_csv('./Jane Street Data/Splitted_Data/JaneStreet_Part0.csv',index_col = 0)
# features_list = [c for c in df.columns if "feature" in c]
# date = 'date' 
# target_list = 'resp'
# feature = ['feature1','feature2'] 
# target_list = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']
# train_test_split_size = 0.2
# fillna_type = 'mean'      #Choose fill Type 1 
# # fill_value = train.mean() #Choose fill Type 2

# Kaggle_custom().JaneStreet_MLP_model_1(df, date, target_list, features_list,
                                       
#                                        train_test_split_size=0.2,
#                                        fillna_type=fillna_type,
#                                         batch_size = 5000,
#                                         hidden_units = [150, 150, 150],
#                                         dropout_rates = [0.2, 0.2, 0.2, 0.2],
#                                         label_smoothing = 1e-2,
#                                         learning_rate = 1e-3,
#                                         epochs=2#2000
#                                     )        



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Epoch 1/2
Epoch 2/2
