# Model Definition 3

In this definition, we're going to trim the fat, so to speak, off of our neural networks. We'll be paring the CNN and MLP each down to only one hidden layer to see how performance is impacted.

Let's go!

In [3]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

import plaidml.keras as pk
pk.install_backend()

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

from keras.models import Sequential, Model, load_model
from keras.layers import Dense, Conv1D, Dropout, LeakyReLU, MaxPooling1D, Embedding, Flatten, Input, Concatenate

### Reading in training and validation data

In [4]:
# Deserialize the two pre-built datasets from disk. Each object is later
# unpacked into a (features, labels) pair.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted step of this project.
with open('./sequence_data.pickle', 'rb') as seq_file:
    sequence_data = pickle.load(seq_file)

with open('./numerical_data.pickle', 'rb') as num_file:
    numeric_data = pickle.load(num_file)

In [5]:
# sequence_data must unpack into exactly two items: the sequence feature
# matrix and the label vector used as the training target further down.
X_seq, y = sequence_data

In [6]:
# numeric_data unpacks into the numeric feature matrix and its label vector.
X_num, y = numeric_data

# This rebinds `y`, so both pickles must carry the same labels in the same row
# order; otherwise joining X_num and X_seq side by side would pair features
# that belong to different samples. Fail loudly rather than train on
# misaligned rows.
assert len(X_num) == len(sequence_data[0]), \
    "numeric and sequence feature matrices have different row counts"
assert np.array_equal(y, sequence_data[1]), \
    "numeric and sequence pickles disagree on the labels"

In [7]:
# Sanity-check dimensions: sequence features, numeric features, then labels.
for array in (X_seq, X_num, y):
    print(array.shape)

(68486, 5)
(68486, 8)
(68486,)


In [8]:
# Stack numeric columns first, then sequence columns, so that a single split
# keeps both feature groups and the labels row-aligned.
# NOTE(review): np.concatenate promotes to a common dtype, so integer token
# ids become floats if X_num is floating point - confirm the Embedding input
# tolerates that.
X = np.concatenate([X_num, X_seq], axis=1)

# Fixed seed: without random_state every kernel restart yields a different
# partition, so the scores reported below could not be reproduced.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=.33, random_state=42)

In [9]:
# Per-modality splits. Both calls share one random_state so they draw the same
# shuffle: row i of xs_train and row i of xn_train then belong to the same
# sample, and ys_train equals yn_train. Left unseeded, each call shuffled
# independently and the two modalities could not be paired up afterwards.
xs_train, xs_test, ys_train, ys_test = train_test_split(X_seq, y, test_size=.33, random_state=42)
xn_train, xn_test, yn_train, yn_test = train_test_split(X_num, y, test_size=.33, random_state=42)

### Deep Learning Methods

In [10]:
# Hyperparameters shared by the model definition and the training call.
max_length = 5     # tokens per sequence; width of the sequence Input layer
vocab_size = 300   # input_dim handed to the Embedding layer
batch_size = 32    # samples per gradient update in model.fit

### Mixed Model

In [20]:
def build_model(num_features=8, seq_length=None, vocab=None, embedding_dim=3):
    """Build and compile the two-branch (numeric + sequence) binary classifier.

    The sequence branch embeds the token ids, applies one Conv1D/LeakyReLU/
    MaxPooling1D stage and a single 64-unit dense layer. The numeric branch is
    a single 64-unit dense layer. Both branch outputs are concatenated and fed
    through a four-stage dense head that ends in one sigmoid unit.

    Parameters
    ----------
    num_features : int, default 8
        Number of columns in the numeric input.
    seq_length : int or None, default None
        Number of tokens per sequence. ``None`` falls back to the notebook
        level ``max_length`` at call time.
    vocab : int or None, default None
        ``input_dim`` of the Embedding layer. ``None`` falls back to the
        notebook level ``vocab_size`` at call time.
    embedding_dim : int, default 3
        Size of each embedding vector.

    Returns
    -------
    keras.models.Model
        Model compiled with the Adam optimizer, binary cross-entropy loss and
        the ``acc`` metric. It expects its inputs as
        ``[numeric_array, sequence_array]``, in that order.
    """
    # Resolve the notebook-level defaults lazily so that calling build_model()
    # with no arguments keeps reading the current global values.
    if seq_length is None:
        seq_length = max_length
    if vocab is None:
        vocab = vocab_size

    # --- sequence branch: embedding -> one conv stage -> one dense layer ---
    seq_input = Input(shape=(seq_length,))
    x = Embedding(vocab, embedding_dim, input_length=seq_length)(seq_input)
    # NOTE(review): with a kernel of 3 followed by a pool of 2, very short
    # sequences leave nothing to pool - presumably seq_length must be >= 4.
    x = Conv1D(256, kernel_size=3, strides=1)(x)
    x = LeakyReLU()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Flatten()(x)
    seq_output = Dense(64, activation='relu')(x)

    # --- numeric branch: a single dense layer ---
    num_input = Input(shape=(num_features,))
    mlp_output = Dense(64, activation='relu')(num_input)

    # --- shared head on top of the concatenated branch outputs ---
    z = Concatenate()([seq_output, mlp_output])
    for units, drop_rate in ((512, .5), (512, .2), (256, .2), (64, .2)):
        z = Dense(units)(z)
        z = LeakyReLU()(z)
        z = Dropout(drop_rate)(z)

    output = Dense(1, activation='sigmoid')(z)

    # Input order is numeric first, sequence second; callers must pass their
    # arrays in the same order.
    final_model = Model(inputs=[num_input, seq_input], outputs=[output])
    final_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    return final_model

In [21]:
# Instantiate the compiled network returned by build_model().
model = build_model()

In [22]:
# Undo the column-wise concatenation: the leading X_num.shape[1] columns are
# the numeric features and the remainder are the sequence tokens. Deriving the
# boundary from X_num keeps this in step with how X was assembled instead of
# repeating a literal 8.
n_numeric = X_num.shape[1]

num_train, seq_train = x_train[:, :n_numeric], x_train[:, n_numeric:]
num_test, seq_test = x_test[:, :n_numeric], x_test[:, n_numeric:]

# List order must match the model's input order: numeric first, then sequence.
xc_train = [num_train, seq_train]
xc_test = [num_test, seq_test]

In [23]:
# Halve the learning rate when val_acc has stalled for 3 epochs (floor 1e-5).
learning_rate_reduction_combined = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=3,
    min_lr=0.00001,
    verbose=2,
)

# Keep on disk only the weights from the epoch with the highest val_acc.
best_model_combined = ModelCheckpoint(
    './combined_cnn_mlp_model.3.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=2,
)

# Stop after 10 epochs without a val_loss improvement and roll back to the
# best weights seen.
# NOTE(review): this callback tracks val_loss while the two above track
# val_acc - confirm the mismatch is intended.
early_stopping_combined = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-10,
    patience=10,
    restore_best_weights=True,
)

In [24]:
# Train for up to 50 epochs; the callbacks may lower the learning rate,
# checkpoint the best epoch, or stop early.
# NOTE(review): the held-out test split doubles as the validation set here, so
# checkpoint selection and early stopping both see test data and the reported
# val_acc is an optimistic estimate.
hist = model.fit(
    x=xc_train,
    y=y_train,
    validation_data=(xc_test, y_test),
    epochs=50,
    batch_size=batch_size,
    callbacks=[
        learning_rate_reduction_combined,
        best_model_combined,
        early_stopping_combined,
    ],
    verbose=1,
)

Train on 45885 samples, validate on 22601 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.89461, saving model to ./combined_cnn_mlp_model.3.h5
Epoch 2/50

Epoch 00002: val_acc improved from 0.89461 to 0.89908, saving model to ./combined_cnn_mlp_model.3.h5
Epoch 3/50

Epoch 00003: val_acc did not improve from 0.89908
Epoch 4/50

Epoch 00004: val_acc improved from 0.89908 to 0.90027, saving model to ./combined_cnn_mlp_model.3.h5
Epoch 5/50

Epoch 00005: val_acc improved from 0.90027 to 0.90421, saving model to ./combined_cnn_mlp_model.3.h5
Epoch 6/50

Epoch 00006: val_acc did not improve from 0.90421
Epoch 7/50

Epoch 00007: val_acc did not improve from 0.90421
Epoch 8/50

Epoch 00008: val_acc improved from 0.90421 to 0.90921, saving model to ./combined_cnn_mlp_model.3.h5
Epoch 9/50

Epoch 00009: val_acc did not improve from 0.90921
Epoch 10/50

Epoch 00010: val_acc improved from 0.90921 to 0.91053, saving model to ./combined_cnn_mlp_model.3.h5
Epoch 11/50

Epoch 00011: 

We actually did get a performance boost by pruning our model! Accuracy improved from 91.0% for our previous best model to 91.7%.

I think this is about as much information as we can get out of our currently engineered data. Let's do another round of feature engineering and try simply one-hot encoding the route data.