## libs

In [1]:
import os

import numpy as np
import pandas as pd
import yaml

import my_data
import nn

In [2]:
# YAML file holding this experiment's settings; it is opened and parsed in the
# "config" section further down.
CONFIG_NAME = "config17.yaml"

## data

In [3]:
%%time
# Load the training and test feature matrices / targets from the project's
# my_data module.
X_train, y_train = my_data.get_data()
X_test, y_test = my_data.get_data_test()
# Index arrays that are later handed to get_query_generator / get_submit together
# with the matching features.
# NOTE(review): presumably per-row query/group ids — confirm in my_data.
inds = my_data.get_inds()
inds_test = my_data.get_inds_test()

CPU times: user 587 ms, sys: 1.34 s, total: 1.93 s
Wall time: 5.12 s


## select features

In [4]:
# The column selection is derived from the training matrix only and then applied
# to both splits so they keep identical feature layouts.
# NOTE: X_train / X_test are overwritten, so this cell is not idempotent —
# re-run the data-loading cell before running it a second time.
selected_features = my_data.get_selected_features(X_train)
X_train, X_test = X_train[:, selected_features], X_test[:, selected_features]

# Both splits must end up with 519 columns (the same value as input_dim in the
# build config shown later).
assert X_train.shape[1] == X_test.shape[1] == 519

## config

In [5]:
# Parse the experiment configuration into a plain dict.
# yaml.safe_load is used instead of yaml.load(f): calling yaml.load without an
# explicit Loader is deprecated since PyYAML 5.1 and raises TypeError on
# PyYAML >= 6, and the safe loader never constructs arbitrary Python objects.
with open(CONFIG_NAME, "r") as f:
    config = yaml.safe_load(f)

  


## create train and test dataset

In [6]:
# Keyword arguments forwarded to my_data.get_query_generator when the training
# queries are built below.
generator_config = config['generator']

In [7]:
# y_train = y_train + np.random.normal(scale=0.01, size=y_train.shape[0])

In [8]:
# Earlier hard-coded variant, kept for reference:
# query_generator = my_data.get_query_generator(inds, X, y, min_length=1, drop_all_zeros=True)
# The generator options now come from the 'generator' section of the config.
query_generator = my_data.get_query_generator(
    inds,
    X_train,
    y_train,
    **generator_config,
)

In [9]:
%%time
# Materialise everything yielded by query_generator into one NumPy array so it can
# be indexed for the train/validation split below.
# NOTE(review): if query_generator is a one-shot generator it is exhausted after
# this cell and must be rebuilt before the cell can be re-run.
data = np.array(list(query_generator))

CPU times: user 10.2 s, sys: 780 ms, total: 11 s
Wall time: 11 s


In [10]:
# Number of rows of `data` to hold out; those rows feed the generator that is
# passed to model.fit as validation_data.
test_size = config['data']['test_size']

In [11]:
# Fixed seed so the same rows are held out on every run.
np.random.seed(42)
# test_inds: integer row positions drawn without replacement.
# (Not to be confused with inds_test, which belongs to the separate test set.)
test_inds = np.random.choice(np.arange(len(data)), size=test_size, replace=False)
# train_inds: boolean mask over all rows, True wherever the row was NOT drawn above.
train_inds = np.isin(np.arange(len(data)), test_inds, invert=True)

In [12]:
# Sanity checks on the split: the training mask and the held-out positions must
# together account for every row, and the hold-out must have the requested size.
assert np.count_nonzero(train_inds) + len(test_inds) == len(data)
assert len(test_inds) == test_size

In [13]:
# Re-seed NumPy right before the generators are constructed.
# NOTE(review): presumably nn.DataGenerator draws from np.random (e.g. to shuffle),
# which would explain the arrays printed below — confirm in nn.
np.random.seed(42)
# train_inds is a boolean mask and test_inds an integer index array; both select
# rows of `data`.
data_generator_train = nn.DataGenerator(data[train_inds,:])
data_generator_test = nn.DataGenerator(data[test_inds,:])

[ 6444  7051    42 16719  4401]
[130  10 477 462  21]


In [14]:
# Reference copy of the two lines printed by the previous cell (seed 42); compare
# with the live output to confirm the same split was reproduced.
# [ 6444  7051    42 16719  4401]
# [130  10 477 462  21]

In [15]:
# Pull out the config sections used to build, compile and train the model.
build_config = config['build']
metric_config = config['metric']
# 'callback' is optional.  `or {}` also covers a key that is present but empty
# (a bare `callback:` in YAML parses to None), which would otherwise break the
# len() check and the ** expansion where the callbacks are created.
callback_config = config.get('callback') or {}

In [16]:
# Display the build settings used for this run.
build_config

{'optimizer': 'adam',
 'input_dim': 519,
 'layer_dims': [512, 128, 32],
 'loss_function': 'lambdarank',
 'epochs': 100,
 'dropout': 0.5}

In [17]:
# Display the callback settings (an empty dict when the config has no 'callback' section).
callback_config

{}

In [18]:
# Assemble every training component from the parsed config, then compile.
optimizer = nn.get_optimizer(build_config['optimizer'])
model = nn.get_model(
    build_config['input_dim'],
    build_config['layer_dims'],
    build_config.get('dropout'),  # None when the build section has no 'dropout'
)
loss_function = nn.get_loss_function(build_config['loss_function'])
basic_metric = nn.get_metric(**metric_config)
model_name = config['name']

# Without any callback settings, monitor and patience are passed explicitly as
# None; otherwise the configured settings are forwarded as keyword arguments.
callbacks = (
    nn.get_callbacks(model_name, **callback_config)
    if len(callback_config) != 0
    else nn.get_callbacks(model_name, monitor=None, patience=None)
)

model.compile(loss=loss_function, optimizer=optimizer, metrics=[basic_metric])

In [19]:
# query_generator_sample = my_data.get_query_generator(inds, X, y, equal_length=4, drop_all_zeros=True, only_one_sample=True)
# model.fit(x=query_generator_sample, epochs=1)

In [20]:
# Echo which config file the training run below belongs to.
CONFIG_NAME

'config17.yaml'

In [21]:
# Train on the training generator and evaluate on the held-out generator after
# each epoch; the returned history object is written to CSV afterwards.
fit_history = model.fit(
    data_generator_train,
    validation_data=data_generator_test,
    epochs=build_config['epochs'],
    callbacks=callbacks,
)

  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 18633 steps, validate for 500 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epo

In [22]:
# Persist the per-epoch training history as CSV.
# The target directory is created first: DataFrame.to_csv does not create missing
# parent directories, and failing here would come only after the full training run.
history_path = f"./history/{config['name']}.csv"
os.makedirs(os.path.dirname(history_path), exist_ok=True)
pd.DataFrame(fit_history.history).to_csv(history_path)

In [23]:
# Save the trained weights under weights/<experiment name>.
model.save_weights(f"weights/{config['name']}")

In [24]:
%%time
query_generator = my_data.get_query_generator(inds_test, X_test, y_test, min_length=0, drop_all_zeros=False, only_one_sample=False)
preds = model.predict(query_generator)
df_res = my_data.get_submit(preds, inds_test)

CPU times: user 31.1 s, sys: 1.82 s, total: 32.9 s
Wall time: 25.3 s


In [25]:
# Write df_res (built by my_data.get_submit) to data/<experiment name>.csv,
# omitting the DataFrame index column.
df_res.to_csv(f"data/{config['name']}.csv", index=False)

In [26]:
# Values interpolated into the (currently commented-out) kaggle CLI call in the
# next cell: the submission message and the file to upload.
# NOTE(review): the predictions were written to data/<name>.csv above — confirm
# that config['submit'] points at that same file.
message = config['description']
fname = config['submit']

In [27]:
# Kaggle submission, currently disabled; uncomment the line below to submit.
# The variables are quoted so that a file name or message containing spaces is
# passed to the CLI as a single argument.
# !kaggle competitions submit -c learning-to-rank-made-fall-2019 -f "$fname" -m "$message"