# Grocery Problem: Neural Network Model

In this notebook, we train models on the data and use cross-validation to find the best model for the job.

# Imports

In [1]:
import time
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import date
import keras.backend as K
import tensorflow as tf


from IPython.display import display # extract a feature record from each date
from sklearn import metrics
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.wrappers.scikit_learn import KerasRegressor

from multiprocessing import Pool, cpu_count
from multiprocessing.pool import ThreadPool

Using TensorFlow backend.


## Load Data
We reload the preprocessed data from the previous notebooks.

In [2]:
# Pull the three arrays out of the archive while the underlying file handle is still open
with open("data/groceries/groceries_dataset.npz", "rb") as f:
    dataset = np.load(f)
    train_ins = dataset["train_ins"]
    train_outs = dataset["train_outs"]
    test_ins = dataset["test_ins"]

In [3]:
# Sanity check: report how many rows ended up in train_ins
print(f"Loaded {len(train_ins)} of training examples")

Loaded 10000000 of training examples


## Reshape Data
Since our model is based on LSTMs, we have to reshape our data from [`example`, `feature`] dimensions to [`example`, `series`, `feature`] dimensions.  

The `series` dimension is designed for multiple points in a series, like words in a sentence or frames in a video. We, however, only have one continuous time series. 

Hence we manually take samples to create series: for each output, take the current input together with the inputs immediately preceding it, forming a window of `t` consecutive inputs, where `t` is another hyperparameter we have to tune.

In [4]:
# Create one input sample: the series_size consecutive rows of the inputs that
# end at (and include) the row indexed by input_i.
def create_sample(packet):
    """Cut a single fixed-length window out of a 2-D input array.

    Parameters
    ----------
    packet : tuple
        ``(inputs, input_i, series_size)`` where ``inputs`` is an array that
        can be indexed as ``inputs[rows, :]``, ``input_i`` is the index of the
        last row of the window and ``series_size`` is the number of rows in
        the window.

    Returns
    -------
    numpy.ndarray
        A float16 copy of rows ``input_i - series_size + 1 .. input_i``.

    Raises
    ------
    IndexError
        If the window would start before the first row or end after the last
        one. Without this check a negative start would silently wrap around
        and pull rows from the end of the array.
    """
    inputs, input_i, series_size = packet
    start, stop = input_i - series_size + 1, input_i + 1
    if start < 0 or stop > len(inputs):
        raise IndexError(
            "window [{}, {}) does not fit inside {} input rows".format(
                start, stop, len(inputs)))
    # reduce size of sample by downcasting to 16 bits
    return inputs[start:stop, :].astype("float16")


# Create samples from the dataset (inputs, outputs) suitable for fitting using a RNN
# Transforms input data of dimension (example, feature) to (sample, series, feature)
# returns the transformed sample inputs together with their matching outputs
def create_samples(inputs, outputs, sample_size=10000, series_size=30, seed=None):
    """Randomly sample fixed-length windows of consecutive rows.

    Each sample is the ``series_size`` rows of ``inputs`` that end at a
    randomly drawn row; its target is the entry of ``outputs`` at that same
    row. Rows are drawn with replacement and are NOT returned in sorted order.

    Parameters
    ----------
    inputs : numpy.ndarray
        2-D array of shape (example, feature).
    outputs : numpy.ndarray
        Array with one entry per row of ``inputs``.
    sample_size : int
        Number of windows to draw.
    series_size : int
        Number of consecutive rows in each window.
    seed : int or None
        When given, the draw uses a private ``RandomState(seed)`` and is
        reproducible; when None the global NumPy random state is used.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``sampled_inputs`` of shape (sample_size, series_size, feature) and
        dtype float16, and ``sampled_outputs`` of length ``sample_size``.

    Raises
    ------
    ValueError
        If there are not more than ``series_size`` examples to sample from.
    """
    assert len(inputs.shape) == 2
    assert len(inputs) == len(outputs)

    # we start sampling from series_size because every sample needs
    # series_size rows (the sampled row and the ones before it) to exist
    n_examples = len(inputs)
    if n_examples <= series_size:
        raise ValueError(
            "need more than series_size={} examples to sample from, got {}".format(
                series_size, n_examples))

    rng = np.random if seed is None else np.random.RandomState(seed)
    sampling_range = rng.randint(series_size, n_examples, size=sample_size)

    # Fill a preallocated float16 array one window at a time: this avoids both a
    # full-precision intermediate and the second copy that stacking a list makes.
    sampled_inputs = np.empty((sample_size, series_size, inputs.shape[1]),
                              dtype="float16")
    for sample_i, input_i in enumerate(sampling_range):
        sampled_inputs[sample_i] = inputs[input_i - series_size + 1:input_i + 1]

    # collect outputs
    sampled_outputs = outputs[sampling_range]

    return sampled_inputs, sampled_outputs

In [5]:
%%time 
# Draw 500,000 random windows of 40 consecutive rows each from the loaded data
sampled_train_ins, sampled_train_outs = create_samples(
    inputs=train_ins,
    outputs=train_outs,
    sample_size=500000,
    series_size=40,
)

CPU times: user 15.1 s, sys: 1.4 s, total: 16.5 s
Wall time: 16.5 s


In [6]:
# Dimensions of the sampled inputs: (sample, series, feature)
np.shape(sampled_train_ins)

(500000, 40, 68)

## Validation-Train Split
We split the data into train and validation subsets so that we can train on training set and validate model on validation set. We do not shuffle because the data is time series.

We include 20,000 examples in our validation set and leave the rest for training

In [7]:
%%time
# Hold out the last 20,000 sampled windows for validation, without shuffling.
# NOTE(review): create_samples draws its rows at random, so the sampled windows are
# not in chronological order and this hold-out is not a "later in time" slice - confirm
# that is acceptable.
# NOTE(review): this rebinds train_ins / train_outs, which until now held the arrays
# loaded from disk; re-running the sampling cell afterwards would feed it the 3-D samples.
train_ins, valid_ins, train_outs, valid_outs = train_test_split(
    sampled_train_ins,
    sampled_train_outs,
    test_size=20000,
    shuffle=False,
)

CPU times: user 271 ms, sys: 564 ms, total: 836 ms
Wall time: 835 ms


In [8]:
# Confirm the sizes of the two subsets produced by the split
print(f"{len(train_ins)} training examples, {len(valid_ins)}  validation examples")

480000 training examples, 20000  validation examples


## Building the model
We build a simple function to abstract away the internal model and simplify hyperparameter tuning.

In [54]:
# Build a recurrent neural network regression model with the given input shape,
# and compile the model to be ready for training.
# Returns the constructed model
def build_neural_net_regressor(input_shape,
                               n_layers,
                               n_units,
                               learning_rate,
                               lr_decay=0,
                               loss="mse",
                               metrics=None,
                               activation="tanh",
                               reg_lambda=0,
                               dropout_prob=0):
    """Build and compile a stacked-LSTM regressor with a single linear output.

    The network is one input LSTM followed by ``n_layers`` further LSTM layers
    (so ``n_layers + 1`` LSTMs in total) and a ``Dense(1)`` output layer with
    no activation. The backend session is cleared before the model is built.

    Parameters
    ----------
    input_shape : tuple
        Shape of one sample, passed to the first LSTM as ``input_shape``.
    n_layers : int
        Number of LSTM layers stacked after the input LSTM; must be >= 0.
    n_units : int
        Number of units in every LSTM layer.
    learning_rate : float
        Learning rate handed to the Adam optimizer.
    lr_decay : float
        Decay handed to the Adam optimizer.
    loss : str
        Loss passed to ``model.compile``; defaults to ``"mse"``.
    metrics : sequence or None
        Metrics passed to ``model.compile``; ``None`` means ``["mse", "mae"]``.
    activation : str
        Activation of the input LSTM, also applied as a separate ``Activation``
        layer after each later LSTM; defaults to ``"tanh"``.
    reg_lambda : float
        L2 factor for the kernel regularizer shared by all layers.
    dropout_prob : float
        When non-zero, a ``Dropout`` layer with this rate follows every LSTM
        after the input LSTM.

    Returns
    -------
    The compiled ``Sequential`` model.

    Raises
    ------
    ValueError
        If ``n_layers`` is negative.
    """
    if n_layers < 0:
        raise ValueError("n_layers must be >= 0, got {}".format(n_layers))
    # Build a fresh list on every call so no mutable default is shared between
    # calls and the caller's own sequence is never handed to Keras directly.
    metrics = ["mse", "mae"] if metrics is None else list(metrics)

    K.clear_session()

    model = Sequential()
    regularizer = l2(l=reg_lambda)

    n_lstm_layers = n_layers + 1
    for layer_i in range(n_lstm_layers):
        # Every LSTM except the last must return the full sequence so the next
        # LSTM can consume it; the last one returns only its final output, which
        # is what the dense output layer needs (this also holds when n_layers == 0).
        is_last = layer_i == n_lstm_layers - 1

        if layer_i == 0:
            ## input layer
            model.add(LSTM(n_units,
                           return_sequences=not is_last,
                           input_shape=input_shape,
                           activation=activation,
                           kernel_regularizer=regularizer))
            continue

        ## hidden layer(s)
        # NOTE(review): these LSTMs are not given `activation`, so they keep the
        # layer's own default and the Activation layer below is applied on top of
        # their output - confirm this stacking is intended.
        model.add(LSTM(n_units,
                       return_sequences=not is_last,
                       kernel_regularizer=regularizer))
        # activation function
        model.add(Activation(activation))
        # regularisation: dropout
        if dropout_prob:
            model.add(Dropout(dropout_prob))

    ## output layer - regression of one real value
    # no activation is applied here, so the output is linear and may be negative
    model.add(Dense(1,
                    kernel_regularizer=regularizer))

    # compile model
    optimizer = Adam(lr=learning_rate, decay=lr_decay)
    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=metrics)

    return model

In [76]:
# The per-sample shape is everything after the leading sample axis
input_shape = train_ins.shape[1:]
# n_units=64 matches the layer widths shown in the recorded model summary
model = build_neural_net_regressor(input_shape=input_shape,
                                   n_layers=3,
                                   n_units=64,
                                   activation="relu",
                                   learning_rate=3e-3,
                                   lr_decay=0)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 40, 64)            34048     
_________________________________________________________________
lstm_2 (LSTM)                (None, 40, 64)            33024     
_________________________________________________________________
activation_1 (Activation)    (None, 40, 64)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 40, 64)            33024     
_________________________________________________________________
activation_2 (Activation)    (None, 40, 64)            0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                33024     
_________________________________________________________________
activation_3 (Activation)    (None, 64)                0         
__________

## Training the model
We now proceed to train the model with the training data:

In [77]:
# Fit for 30 epochs, passing the held-out windows as validation data;
# the returned history object feeds the learning-curve plot
history = model.fit(
    x=train_ins,
    y=train_outs,
    batch_size=4069,
    epochs=30,
    validation_data=(valid_ins, valid_outs),
)

Train on 480000 samples, validate on 20000 samples
Epoch 1/30
Epoch 2/30
 36621/480000 [=>............................] - ETA: 57s - loss: 383.4537 - mean_squared_error: 383.4537 - mean_absolute_error: 6.8474

KeyboardInterrupt: 

## Evaluating the model
We evaluate the model by plotting its learning curve

In [None]:
# Plot training and validation loss per epoch from the recorded fit history
n_epochs = len(history.history["loss"])
epoch_axis = range(n_epochs)

# Explicit figure/axes instead of the implicit pyplot state machine
fig, ax = plt.subplots()
ax.set_title("Learning Curve - Loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.plot(epoch_axis, history.history["loss"], label="Training loss")
ax.plot(epoch_axis, history.history["val_loss"], label="Validation loss")
ax.legend();  # trailing semicolon keeps the Legend repr out of the cell output

We define the metric used to evaluate models in the competition:

In [43]:
def root_mean_squared_log_error(y_true, y_pred):
    """Return the square root of sklearn's mean squared logarithmic error."""
    msle = metrics.mean_squared_log_error(y_true, y_pred)
    return msle ** 0.5

In [44]:
# Run the fitted model over the validation windows
predictions = model.predict(x=valid_ins, batch_size=4069)

In [45]:
# Score the validation predictions with root_mean_squared_log_error.
# NOTE(review): np.abs flips the sign of any negative target or prediction rather than
# clipping it to zero - presumably a workaround for mean_squared_log_error rejecting
# negative values; confirm this is the intended treatment.
root_mean_squared_log_error(np.abs(valid_outs), np.abs(predictions))

1.0106141529904826