In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

In [2]:
# load numpy array from csv file
from numpy import loadtxt
# load array
X_train = loadtxt('x_train2.csv', delimiter=',')
Y_train = loadtxt('y_train2.csv', delimiter=',')
# print the array
X_train

array([[2., 6., 9., ..., 0., 0., 0.],
       [2., 6., 6., ..., 0., 0., 0.],
       [2., 6., 6., ..., 0., 0., 0.],
       ...,
       [2., 6., 6., ..., 0., 0., 0.],
       [2., 6., 6., ..., 0., 0., 0.],
       [2., 6., 9., ..., 0., 0., 0.]])

In [3]:
#reducing the size of the input length so it can train on a CPU
X_train = X_train[:, :100]
X_train

array([[ 2.,  6.,  9., ...,  4.,  7.,  4.],
       [ 2.,  6.,  6., ..., 10., 10., 21.],
       [ 2.,  6.,  6., ...,  4.,  5.,  4.],
       ...,
       [ 2.,  6.,  6., ...,  0.,  0.,  0.],
       [ 2.,  6.,  6., ...,  0.,  0.,  0.],
       [ 2.,  6.,  9., ..., 15.,  7.,  7.]])

In [4]:
Y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [5]:
VOCAB_SIZE = 1254
INPUT_LENGTH = 100 #1000
EMBEDDING_DIM = 128

In [6]:
from keras import backend as K
from keras.layers import Layer
from keras import initializers, regularizers, constraints

In [7]:
# custom dot product function
def dot_product(x, kernel):
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)

In [8]:
# find a way to return attention weight vector a
class AttentionWithContext(Layer):
    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):

        self.supports_masking = True
        # initialization of all learnable params
        self.init = initializers.get('glorot_uniform')
        
        # regularizers for params, init as None
        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        
        # constraints for params, init as None
        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)

    def build(self, input_shape):
#         assert len(input_shape) == 3
        
        # weight matrix
        self.W = self.add_weight((input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        # bias term
        if self.bias:
            self.b = self.add_weight((input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        
        # context vector
        self.u = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)
        
    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.exp(ait)
        
        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
#         a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

In [9]:
# model
def build_model(vocab_size, embedding_dim, input_length):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=input_length))
    model.add(SpatialDropout1D(0.2))
    model.add(Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
    model.add(AttentionWithContext())
    model.add(Dense(41, activation='softmax'))
    return model

In [10]:
model = build_model(VOCAB_SIZE, EMBEDDING_DIM, INPUT_LENGTH)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 128)          160512    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 100, 128)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 256)          263168    
_________________________________________________________________
attention_with_context_1 (At (None, 256)               66048     
_________________________________________________________________
dense_1 (Dense)              (None, 41)                10537     
Total params: 500,265
Trainable params: 500,265
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 1800 samples, validate on 200 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
example_x = X_train[0]
print(np.shape(example_x))
temp = model.predict(X_train[0:100])
# print(len(temp)), temp
print(temp[0])
for i in temp:
    print(np.argmax(i))


(100,)
[1.96912885e-02 3.30536067e-03 1.18446974e-02 1.24880604e-01
 1.38624683e-02 1.84409216e-01 1.29668726e-04 2.10610358e-03
 9.58471298e-02 2.45568197e-04 5.69353532e-03 1.58555893e-04
 2.15168251e-03 5.17400727e-02 1.26485684e-04 8.69503096e-02
 3.24413739e-02 7.59687508e-04 2.68192409e-04 4.88667004e-03
 1.33599853e-04 1.94943583e-04 4.46553677e-02 8.88536219e-04
 7.46090189e-02 1.02181709e-03 2.04121266e-02 9.89629701e-03
 3.77784148e-02 4.96099005e-03 1.45957267e-04 1.75513625e-02
 3.75196673e-02 3.13366181e-03 6.33273320e-03 8.03843606e-04
 3.80044710e-03 7.79126063e-02 8.79563019e-03 6.35619694e-03
 1.59806700e-03]
5
3
5
5
5
3
5
5
3
3
5
3
5
3
3
3
5
5
5
5
5
5
5
5
5
5
5
5
3
3
5
3
3
5
5
3
5
3
5
5
5
3
5
5
5
5
5
5
3
3
5
5
5
3
5
3
5
5
3
3
5
5
5
5
5
5
5
5
5
5
5
3
5
3
5
5
5
5
5
3
5
5
5
3
5
5
5
3
5
5
5
5
3
5
3
5
5
5
5
5
